Project Overview

Priming response (enhanced survival upon secondary infection) has been demonstrated with different routes (septic and oral) in the red flour beetle Tribolium castaneum and shows to have high degree of specificity. Some studies have pointed out the importance of the hosts natural microbiota, suggesting that priming might be partially explained by the presence of commensal bacteria in the gut. Using a well established model system for studying host-parasite interaction and insect immunity, red flour beetle Tribolium castaneum and enthomopathogenic bacterium Bacillus thurigiensis (Bt), we have studied if priming could influence microbiome composition of the beetle larvae. We conducted an experiment using the two established routes of priming in this system: injection with heat-killed Bt and oral via ingestion of filtered sterilized bacterial spore supernatants by beetle larvae, with diverse strains of Bt varying in their ability to induce priming. Microbiota composition was assessed after the priming treatment by deep sequencing of the v1-v2 region of the bacterial 16S rRNA gene.

Sequencing Methodology

For sequencing, variable regions V1 and V2 of the 16S rRNA gene within the DNA samples were amplified using the primer pair 27F-338R in a dual-barcoding approach as per Caporaso et al. 2012. 3.5 µl of cDNA was used for amplification and PCR products were verified using the electrophoresis in agarose gel. PCR products were normalized using the SequalPrep Normalization Plate Kit, pooled equimolarly, and sequenced on the Illumina MiSeq v3 2x300bp. Demultiplexing after sequencing was based on 0 mismatches in the barcode sequences.

Loading R packages:

library(rmarkdown)
library(dada2)
library(ggplot2)
library(phyloseq)
library(DECIPHER)
library(phangorn)
library(ape)
library(decontam)
setwd("/home/shrey/ana_metagenome/Upload/")

Pre-processing the reads:

# Loading the Forward and reverse fastq files for all the samples:
path <- "ngs_data"
fnFs <- sort(list.files(path, pattern="_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq.gz", full.names = TRUE))
# Extract sample names:
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)

# Plotting the quality profiles of the reads of first 10 samples:
temp <- sample.names[1:8]
names(temp) <- basename(fnFs)[1:8]
plotQualityProfile(fnFs[1:8]) + ggtitle("Forward") + facet_wrap(~file, ncol = 4, labeller=as_labeller(temp))

names(temp) <- basename(fnRs)[1:8]
plotQualityProfile(fnRs[1:8]) + ggtitle("Reverse") + facet_wrap(~file, ncol = 4, labeller=as_labeller(temp))

# Trimming the fastq files:
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, trimLeft = c(5,5) , truncLen=c(240,220),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=TRUE)

## Checking quality after trimming the reads:
temp <- sample.names[1:8]
names(temp) <- basename(filtFs)[1:8]
plotQualityProfile(filtFs[1:8]) + ggtitle("Forward") + facet_wrap(~file, ncol = 4, labeller=as_labeller(temp))

names(temp) <- basename(filtRs)[1:8]
plotQualityProfile(filtRs[1:8]) + ggtitle("Reverse") + facet_wrap(~file, ncol = 4, labeller=as_labeller(temp))

# De-replication: 
derepFs <- derepFastq(filtFs)
derepRs <- derepFastq(filtRs)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names

#Learning the error rates:
errF <- learnErrors(filtFs, multithread=TRUE, verbose = F)
## 101048355 total bases in 429993 reads from 9 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=TRUE, verbose = F)
## 111423535 total bases in 518249 reads from 11 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)

plotErrors(errR, nominalQ=TRUE)

Creating ASV table:

#Creating dada objects:
dadaFs <- dada(derepFs, err=errF, multithread=TRUE, verbose = F)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE, verbose = F)

# Merge paired reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = F)
# Constructing ASV sequence table
seqtab <- makeSequenceTable(mergers)
# Inspect distribution of sequence lengths:
table(nchar(getSequences(seqtab)))
## 
##  235  238  239  244  246  249  251  252  254  258  261  262  263  264  265  266  267  268  269  270  271 
##    1    2    1    1    2    2    1    1    2    1    4    1    1    9   18  352   79 1068  278  340  143 
##  272  273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288  289  290  291  292 
##  120   15   49    3   31   15   56   39   33   61  106  128   99  179   91  243  120  245  157  195  672 
##  293  294  295  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312  313 
##  394 2003  721  775  402  571  304  845  183  190  416  494  206  340  359  421  220  266   65   47  128 
##  314  315  316  317  318  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334 
##  108   41   60   53  126   76   22   11   52   11    7    1    3    6    8   10    5   22   83  301   45 
##  335  336  337  340  342  352  358  360  371  391 
##    1    1    8    2    3    2    1    7    1    1
# Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=F)
# Chimeras account for 4.1% of the merged sequence reads:
(1- sum(seqtab.nochim)/sum(seqtab)) *100
## [1] 4.175729
#Tracking reads through the pipeline:
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
paged_table(as.data.frame(track))
rm(getN,out,filtFs,filtRs,dadaFs,dadaRs,fnFs,fnRs,derepFs,derepRs,errF,errR,mergers,seqtab,temp)

# Assigning taxonomy using Silva database
taxa <- assignTaxonomy(seqtab.nochim, "database/silva 138/silva_nr99_v138_train_set.fa.gz", 
                       multithread=TRUE, verbose = FALSE)
taxa <- addSpecies(taxa, "database/silva 138/silva_species_assignment_v138.fa.gz")

# making a fasta of ASV seqs:
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
asv_fasta <- asv_seqs
for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_fasta <- c(rbind(asv_headers, asv_seqs))

# Making ASV count table:
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
asv_tab<-as.matrix(asv_tab)

#Making ASV taxanomy table:
asv_tax <- taxa
row.names(asv_tax) <- sub(">", "", asv_headers)
paged_table(as.data.frame(asv_tax))
# DADA2 complete.

Creating a phyloseq object:

# Reading the table with samples and treatments
sample_table <- as.data.frame(read.csv("Sample_table.csv",sep=";" ))
sample_table <- sample_table[1:90,c(1,4:11)]
rownames(sample_table) <- sample_table$Samples
paged_table(sample_table)
count_phy <- otu_table(asv_tab, taxa_are_rows=T)
sample_table_phy <- sample_data(sample_table)
tax_tab_phy<-tax_table(asv_tax)

# Creating Phylogenetic Tree of ASVs:
# Aligning the sequences:
seqs <-(grep(pattern = ">ASV", asv_fasta,invert = T,value = T))
names(seqs) <- seqs
alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA, verbose = F)
phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
write.phyDat(phang.align, file="alignment.fasta", format="fasta")
# Constructing the tree using FastTree:
system("./FastTreeMP -gtr -nt alignment.fasta  > gene-tree_all.tre", ignore.stderr = T)
# Importing the tree
fitGTR <- read.tree(file = "gene-tree_all.tre")
tree = phy_tree(fitGTR)
taxa_names(tree) <- sub(">", "", asv_headers)

# Creating a phyloseq object:
ps<- phyloseq(count_phy,sample_table_phy,tax_tab_phy,tree)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10933 taxa and 90 samples ]
## sample_data() Sample Data:       [ 90 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 10933 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10933 tips and 10931 internal nodes ]
# Plotting the library size:
df <- as.data.frame(sample_data(ps))
df$LibrarySize <- sample_sums(ps)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample)) + geom_point()

Removing contaminant ASVs:

sample_data(ps)$is.neg <- sample_data(ps)$Sample == "control"
contamdf.prev <- isContaminant(ps, method="prevalence", neg="is.neg")
table(contamdf.prev$contaminant) 
## 
## FALSE  TRUE 
## 10858    75
# threshold=0.5, will identify as contaminants all those ASVs which are more prevalent in negative controls than in the samples.
contamdf.prev05 <- isContaminant(ps, method="prevalence", neg="is.neg", threshold=0.5)
table(contamdf.prev05$contaminant)
## 
## FALSE  TRUE 
## 10655   278
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sample == "control", ps.pa)
ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sample == "Sample", ps.pa)
# Dataframe of prevalence in positive and negative samples:
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
                    contaminant=contamdf.prev05$contaminant)
# Plot showing the number of these taxa observed in negative controls and positive samples:
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
  xlab("Number of Negative Controls") + ylab("Number of True Samples") + theme_bw() + 
  labs(color='Contaminants') +
  theme(panel.grid.major = element_blank(), legend.position = c(0.9,0.1),
        legend.background = element_rect(fill=alpha('white', 0)))  + 
  scale_x_continuous(breaks = seq(0, 8, by = 1)) +
  scale_y_continuous(breaks = seq(0, 80, by = 10))+
  scale_fill_viridis_d()

# Removing the contaminations
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10933 taxa and 90 samples ]
## sample_data() Sample Data:       [ 90 samples by 10 sample variables ]
## tax_table()   Taxonomy Table:    [ 10933 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10933 tips and 10931 internal nodes ]
ps.noncontam <- prune_taxa(!contamdf.prev05$contaminant, ps)
# Removing the mock community
ps.noncontam <- prune_samples(sample_names(ps.noncontam) != "I21726-L1", ps.noncontam) 
ps.noncontam <- prune_samples(sample_names(ps.noncontam) != "I21743-L1", ps.noncontam)
# Removing the controls out
ps.noncontam <- prune_samples(sample_data(ps.noncontam)$Sample == "Sample", ps.noncontam) 
ps.noncontam
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10655 taxa and 79 samples ]
## sample_data() Sample Data:       [ 79 samples by 10 sample variables ]
## tax_table()   Taxonomy Table:    [ 10655 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10655 tips and 10653 internal nodes ]
# Number of features for each phyla
table(tax_table(ps.noncontam)[, "Phylum"], exclude = NULL)
## 
##             Abditibacteriota              Acidobacteriota             Actinobacteriota 
##                           69                          295                         3407 
##               Armatimonadota                 Bacteroidota             Bdellovibrionota 
##                           58                         1174                           66 
##             Campilobacterota                  Chloroflexi                Cyanobacteria 
##                           11                          184                          393 
##             Deferribacterota                 Deinococcota                 Dependentiae 
##                            2                          128                            2 
##             Desulfobacterota              Elusimicrobiota            Entotheonellaeota 
##                           11                            2                            1 
##               Fibrobacterota                   Firmicutes               Fusobacteriota 
##                            5                         1170                           55 
##              Gemmatimonadota              Hydrogenedentes             Latescibacterota 
##                           57                            1                            2 
##            Methylomirabilota                  Myxococcota                 Nitrospirota 
##                            7                          183                           10 
##              Patescibacteria              Planctomycetota               Proteobacteria 
##                           24                           73                         3156 
##                      RCP2-54 SAR324 clade(Marine group B)                Spirochaetota 
##                            2                            4                            4 
##                  Sumerlaeota                 Synergistota            Verrucomicrobiota 
##                            2                            4                           76 
##                        WPS-2                         <NA> 
##                            8                            9
ps.noncontam <- subset_taxa(ps.noncontam, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))
# Compute prevalence of each feature:
prevdf = apply(X = otu_table(ps),MARGIN = ifelse(taxa_are_rows(ps), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})
prevdf = apply(X = otu_table(ps.noncontam),
               MARGIN = ifelse(taxa_are_rows(ps.noncontam), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})
prevdf = data.frame(Prevalence = prevdf,TotalAbundance = taxa_sums(ps.noncontam),tax_table(ps.noncontam))
# Mean and total prevelance of each feature:
phyla_prevelance <- plyr::ddply(prevdf, "Phylum", 
                                function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
colnames(phyla_prevelance) <- c("Phylum", "Mean Prevelance", "Total Prevelance(Sum of Prevelance)")
# Phyla Prevelence:
paged_table(phyla_prevelance)
# Plotting abundance before filtering
ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps.noncontam),color=Phylum)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + geom_point(size = 0.5, alpha = 0.7) +
  scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) + theme(legend.position="none",
                              strip.background = element_rect(colour="black",fill="white"),
                              strip.text.x = element_text(margin = margin(0.05,0,0.05,0, "cm")),
                              text = element_text(size = 6),
                              axis.text = element_text(size = 4.5),
                              panel.background = element_rect(fill = "transparent"),
                              panel.grid.minor = element_line(color = "lightgray"),
                              panel.border = element_rect(fill = "transparent",
                                                          color = "black"))+
  scale_fill_viridis_d()

# Defining phylas to filter:
filterPhyla = c("BRC1", "Deferribacteres","Elusimicrobia","Entotheonellaeota","Hydrogenedentes",
                "Latescibacteria","Dependentiae","Cyanobacteria")
# Filter entries with unidentified Phylum.
ps1 = subset_taxa(ps.noncontam, !Phylum %in% filterPhyla)
# Exporting files for MicrobiomeAnalyst analysis:
write.tree(phy = phy_tree(ps1), file = "microbiomeanalyst/genetree_filtered_all.tre", tree.names = TRUE)
write.table(tax_table(ps1),file = "microbiomeanalyst/ASVs_taxonomy_filtered.csv", sep = "\t")
write.table(otu_table(ps1),file = "microbiomeanalyst/ASVs_counts_filtered.csv", sep = "\t")
write.table(sample_data(ps1),file = "microbiomeanalyst/Data_table_filtered.csv", sep = "\t")

**Session Info:**

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.7 LTS
## 
## Matrix products: default
## BLAS: /opt/R/3.5.0/lib/R/lib/libRblas.so
## LAPACK: /opt/R/3.5.0/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=de_DE.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] decontam_1.2.1      phangorn_2.5.5      ape_5.5             DECIPHER_2.10.2     RSQLite_2.2.1      
##  [6] Biostrings_2.50.2   XVector_0.22.0      IRanges_2.16.0      S4Vectors_0.20.1    BiocGenerics_0.28.0
## [11] phyloseq_1.26.1     ggplot2_3.3.5       dada2_1.10.1        Rcpp_1.0.7          rmarkdown_2.11     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-153                bitops_1.0-7                matrixStats_0.61.0         
##  [4] bit64_4.0.5                 RColorBrewer_1.1-2          GenomeInfoDb_1.18.2        
##  [7] bslib_0.3.0                 tools_3.5.0                 utf8_1.2.2                 
## [10] R6_2.5.1                    vegan_2.5-7                 DBI_1.1.1                  
## [13] mgcv_1.8-33                 colorspace_2.0-2            permute_0.9-5              
## [16] ade4_1.7-18                 withr_2.4.2                 tidyselect_1.1.1           
## [19] bit_4.0.4                   compiler_3.5.0              Biobase_2.42.0             
## [22] DelayedArray_0.8.0          labeling_0.4.2              sass_0.4.0                 
## [25] scales_1.1.1                quadprog_1.5-8              stringr_1.4.0              
## [28] digest_0.6.27               Rsamtools_1.34.1            pkgconfig_2.0.3            
## [31] htmltools_0.5.2             highr_0.9                   fastmap_1.1.0              
## [34] rlang_0.4.11                farver_2.1.0                jquerylib_0.1.4            
## [37] generics_0.1.0              hwriter_1.3.2               jsonlite_1.7.2             
## [40] BiocParallel_1.16.6         dplyr_1.0.7                 RCurl_1.98-1.5             
## [43] magrittr_2.0.1              GenomeInfoDbData_1.2.0      biomformat_1.10.1          
## [46] Matrix_1.2-9                munsell_0.5.0               Rhdf5lib_1.4.3             
## [49] fansi_0.5.0                 lifecycle_1.0.0             stringi_1.7.4              
## [52] yaml_2.2.1                  MASS_7.3-54                 SummarizedExperiment_1.12.0
## [55] zlibbioc_1.28.0             rhdf5_2.26.2                plyr_1.8.6                 
## [58] grid_3.5.0                  blob_1.2.2                  crayon_1.4.1               
## [61] lattice_0.20-45             splines_3.5.0               multtest_2.38.0            
## [64] knitr_1.34                  pillar_1.6.2                igraph_1.2.6               
## [67] GenomicRanges_1.34.0        reshape2_1.4.4              codetools_0.2-18           
## [70] fastmatch_1.1-3             glue_1.4.2                  evaluate_0.14              
## [73] ShortRead_1.40.0            latticeExtra_0.6-28         data.table_1.14.0          
## [76] RcppParallel_5.1.4          vctrs_0.3.8                 foreach_1.5.1              
## [79] gtable_0.3.0                purrr_0.3.4                 assertthat_0.2.1           
## [82] cachem_1.0.6                xfun_0.26                   survival_3.2-13            
## [85] tibble_3.1.4                iterators_1.0.13            GenomicAlignments_1.18.1   
## [88] memoise_2.0.0               cluster_2.1.2               ellipsis_0.3.2